library(Rsamtools)
library(GenomicRanges)
library(readr)
library(gplots)
library(viridis)
library(pastecs) 

# Sample names; the BAM path of each sample is built from its name and extension
file_name <- c(
  "wt_pulse", "wt_c10", "wt_c15", "wt_c20", "wt_c40",
  "cac_pulse", "cac_c10", "cac_c15", "cac_c20", "cac_c40"
)
# every sample uses the same file name suffix
file_extension <- rep("_sampled", length(file_name))

# Fragment length boundaries (bp) used to select nucleosomal reads
length_min <- 140
length_max <- 180

# Bandwidth of the Gaussian kernel used for the midpoint density
band <- 30

# Global settings
options(scipen = 999) # disable scientific notation

# Chromosome names and lengths (bp)
chr_coordinates.df <- data.frame(
  chr = c(
    "chrI", "chrII", "chrIII", "chrIV", "chrV", "chrVI", "chrVII", "chrVIII",
    "chrIX", "chrX", "chrXI", "chrXII", "chrXIII", "chrXIV", "chrXV", "chrXVI"
  ),
  end = c(
    230218, 813184, 316620, 1531933, 576874, 270161, 1090940, 562643,
    439888, 745751, 666816, 1078177, 924431, 784333, 1091291, 948066
  ),
  stringsAsFactors = FALSE
)

# Empty data frame that will hold one row per nucleosome:
# chromosome, peak position, and one occupancy column per sample
cn <- c(
  "chr", "peak",
  "wt_p", "wt_c10", "wt_c15", "wt_c20", "wt_c40",
  "cac_p", "cac_c10", "cac_c15", "cac_c20", "cac_c40"
)
nuc_master <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cn)

# The regions to process are the whole chromosomes
index_file <- chr_coordinates.df

# List holding one per-sample data frame of reads, so that each sample can be
# addressed individually later on
data <- list()

#get nucleosome peaks and occupancy
#
# For every chromosome listed in index_file:
#   1. read the alignments of each sample and keep the midpoints of the
#      nucleosome-sized fragments (length_min < width < length_max),
#   2. call nucleosome dyads as the local maxima of a Gaussian kernel density
#      of the reference sample's midpoints,
#   3. count, for each sample, the midpoints lying within 70 bp of each dyad.
# One row per dyad (chr, peak, one count per sample) is appended to nuc_master.

# index of the next free row of nuc_master
count <- 1

#nucs called from wt_c40 (5th entry of file_name) are used as references for
#nucleosome dyad positions
ref_sample <- 5

for (r in seq_len(nrow(index_file))) {
  chr <- index_file[r, "chr"]

  new_start <- 1
  new_end <- index_file[r, "end"]

  # restrict the BAM query to the current chromosome
  chr.gr <- GRanges(seqnames = chr,
                    ranges = IRanges(start = new_start, end = new_end))
  p <- ScanBamParam(what = c("rname", "strand", "pos", "isize"), which = chr.gr)

  for (f in seq_along(file_name)) {
    #data files
    file_name.bam <- paste0("/data/bam_bai/", file_name[f], file_extension[f], ".bam")
    file_name.bam.bai <- paste0("/data/bam_bai/", file_name[f], file_extension[f], ".bam.bai")

    A_reads.l <- scanBam(file = file_name.bam,
                         index = file_name.bam.bai,
                         param = p)

    #create a new GenomicRanges object for the reads from this list;
    #the insert size (isize) is used as the fragment width
    # NOTE(review): IRanges() rejects negative/NA widths, so this presumably
    # relies on the BAM files only holding records with a positive isize - confirm
    A_reads.gr <- GRanges(seqnames = A_reads.l[[1]]$rname,
                          ranges = IRanges(start = A_reads.l[[1]]$pos,
                                           width = A_reads.l[[1]]$isize))

    #only count nucleosome-sized reads
    is_nucleosomal <- width(A_reads.gr) > length_min & width(A_reads.gr) < length_max
    subset_data.gr <- A_reads.gr[which(is_nucleosomal)]

    #finding the midpoints of those reads
    midpoints.gr <- IRanges(start = mid(ranges(subset_data.gr)), width = 1)
    midpoints <- as.data.frame(midpoints.gr)

    # A sample without nucleosome-sized reads keeps a zero-row data frame here
    # (empty 'mid' column), so it contributes an occupancy of 0 everywhere
    # instead of phantom midpoints at coordinate 0.
    data[[f]] <- as.data.frame(subset_data.gr)
    data[[f]]$mid <- midpoints$start
  }

  #skip the chromosome if the reference sample has no reads: dyads are called
  #from its midpoints, so there is nothing to call
  if (nrow(data[[ref_sample]]) == 0) {
    next
  }

  cat(paste("saving midpoints on chromosome", r,"\n"))

  dm <- data[[ref_sample]]$mid

  #calculate turnpoints PEAKS
  # the density is requested on new_end/5 grid points
  myDensity <- density(dm, bw = band, kernel = "gaussian", n = (new_end/5))

  tp <- turnpoints(myDensity$y)

  #finding all of the peaks within the range
  # tp$tppos lists the turning points in order and they alternate between
  # peaks and pits, so the peaks are the odd entries when the first turning
  # point is a peak and the even entries otherwise. Index by the NUMBER of
  # turning points, not by their position values.
  tp_index <- seq_along(tp$tppos)
  if (tp$firstispeak) {
    peak_pos <- tp$tppos[tp_index %% 2 == 1]
    print("first is a peak ")
  } else {
    peak_pos <- tp$tppos[tp_index %% 2 == 0]
    print("first is NOT a peak ")
  }
  d_peaks <- myDensity$x[peak_pos]
  d_peak_scores <- myDensity$y[peak_pos]

  # drop peaks whose density is missing or numerically zero
  d_peak_scores[which(is.na(d_peak_scores))] <- 0
  d_peaks <- d_peaks[which(d_peak_scores > 1e-8)]

  n_peaks <- length(d_peaks)
  print(n_peaks)

  # occupancy of every nucleosome (rows) in every sample (columns)
  chr_counts <- matrix(0L, nrow = n_peaks, ncol = length(file_name))

  for (i in seq_along(d_peaks)) {
    cat(paste("saving nucleosome", i,"on chromosome", r,"\n"))

    #define 140bp nuc window
    cstart <- d_peaks[i] - 70
    cend <- d_peaks[i] + 70

    for (f in seq_along(file_name)) {
      chr_counts[i, f] <- sum(data[[f]]$mid > cstart & data[[f]]$mid < cend)
    }
  }

  # append this chromosome's nucleosomes in one step rather than growing
  # nuc_master cell by cell
  if (n_peaks > 0) {
    chr_nucs <- data.frame(chr, d_peaks, chr_counts, stringsAsFactors = FALSE)
    colnames(chr_nucs) <- colnames(nuc_master)
    nuc_master <- rbind(nuc_master, chr_nucs)
    count <- count + n_peaks
  }
}

#associate nucleosomes with transcription level (for fig2 and fig3)
# Every nucleosome starts as 'Intergenic'. A nucleosome whose peak lies strictly
# between the start and end of a gene on the same chromosome receives that
# gene's txn_quintile; when several genes contain the peak, the last one in the
# gene table wins. The gene table must provide the columns chr, start, end and
# txn_quintile.
all_genes <- read_csv('~/data/feature_file/gene_master.csv')
nuc_master$position <- 'Intergenic'
# seq_len() keeps the loop from running on an empty gene table
for (i in seq_len(nrow(all_genes))) {
  in_gene <- which(nuc_master$chr == all_genes$chr[i] &
                     nuc_master$peak > all_genes$start[i] &
                     nuc_master$peak < all_genes$end[i])
  nuc_master$position[in_gene] <- all_genes$txn_quintile[i]
}

# Add the (still empty) assembly timing index (ATI) columns for both strains;
# they are filled in for all nucleosomes further down
nuc_master[["wt_ATI"]] <- NA
nuc_master[["cac_ATI"]] <- NA

# Assembly timing index (ATI) of one nucleosome.
#
# The occupancies are first converted to fractions of their total. The ATI is
# the reciprocal of the mean position-weighted fraction, where the k-th time
# point carries weight k:
#   ATI = 1 / (sum(k * dat[k] / sum(dat)) / length(dat))
# so occupancy concentrated at early time points gives a high ATI.
#
# dat: numeric vector of occupancies ordered along the time course (any length;
#      the script passes five time points).
# Returns a single number; NaN when all occupancies are zero, NA when dat
# contains NA.
calc_ATI <- function(dat){
  frac <- dat / sum(dat)
  time_weight <- seq_along(dat)
  1 / (sum(time_weight * frac) / length(dat))
}

# Compute the ATI of every nucleosome for both strains and save the table.
# Columns 3:7 of nuc_master are the wt occupancies and columns 8:12 the cac
# occupancies, each ordered along the time course.

# convert the count columns once instead of slicing the data frame per row
wt_counts <- as.matrix(nuc_master[, 3:7])
cac_counts <- as.matrix(nuc_master[, 8:12])

n_nuc <- nrow(nuc_master)
wt_ATI <- rep(NA_real_, n_nuc)
cac_ATI <- rep(NA_real_, n_nuc)

# seq_len() makes the loop a no-op for an empty table
for (i in seq_len(n_nuc)) {
  wt_ATI[i] <- calc_ATI(unname(wt_counts[i, ]))
  cac_ATI[i] <- calc_ATI(unname(cac_counts[i, ]))

  # progress report
  if (i %% 500 == 0) {
    cat(paste('nuc # ',i,'\n'))
  }
}

nuc_master$wt_ATI <- wt_ATI
nuc_master$cac_ATI <- cac_ATI


write.csv(nuc_master,'~/data/nuc_master.csv',row.names = FALSE)



